


insheet using "raw data\combine4_2015_0713_extract.csv", comma clear
rename sa_id sa
gen pdp = 0
replace pdp = 1 if status_name == "Active"
save "stata data/pdp_status_file_cleaned", replace


use "stata data/TOU_demographic 2", clear
merge 1:1 sa using "stata data/pdp_status_file_cleaned" 
drop if _merge == 2
drop _merge



format sa prem per sp_id acct %15.0g


gen H_id= regexm(rs,"H")
drop if H_id == 0


drop if lat_prm == 0
drop if long_prm == 0


//the goal here is to colapse people I feel comfortable with being on the same new "account"
///Here I want to identify people who all had their smartmeters installed on the same date
//so I get the mean date, and tag those that had the same install date and then count them
//if the ones with the same install date are the same count as the group, then I 
//feel like I can collapse them. 


gegen group_dup = group(lat_prm long_prm per_id)

//not worried about single row customers/premisess here
//so drop them b/c not a concern
duplicates tag group_dup, gen(how_many_dups)
drop if how_many_dups == 0




//first find people that have the same install date. A necessary condition to collapse 
egen mean_sp_min_date = mean(sp_min_date), by(group_dup)

gen install_date_match = 0
replace install_date_match = 1 if mean_sp_min_date == sp_min_date


//figure out meters that share with a big meter. Say A10 or E19.
//I want to tag them for later on so I can do sensitivity
gen big_meter = 1
replace big_meter = 0 if rs == "HA1"
replace big_meter = 0 if rs == "HA1X"

egen share_facility_big_meter = max(big_meter), by(group_dup)


//this is a big choice, but I think a good one
//what it does is drop out the big metered guys
//in these next steps, I do the following:
//1) count how many rows youre install date matches for within group_dup
//2) Count how many times your group_dup shows up in the data
//IF these are the same, then I say we can collapse down to that group_dup level

//the reason I chose to drop big_meter ones is I stil lwant to be able to 
//collapse down to the A1 level even if there is an E19 on premise. 
//so for example, 1 location, 3 A1 and 1 E19. If the 3A1s were all upgraded on the same date
//this allows me to collapse down to their level. 


keep if big_meter == 0

egen match_count = sum(install_date_match), by(group_dup)

duplicates tag group_dup, gen(group_dup_count)
replace group_dup_count = group_dup_count+1 //need to add 1 to this to account for duplicates tag works


gen multiple_collapse_ok = 0
replace multiple_collapse_ok = 1 if group_dup_count == match_count 

order group_dup share bin sp_min_date
sort group_dup


gen many_meter_issue = 0
replace many_meter_issue = 1 if multiple_collapse_ok == 0


keep sa multiple_collapse group_dup many_meter_issue share_facility_big_meter

save "stata data/multiple meter summary information", replace



use "stata data\A1 PDP 2014 changeover for regs with weather", clear

drop if sa == 6638700673 //odd outlier 

// add in stuff on collapsing multiple saids that are on the same 
//premise or by same person on same location. Do this before any data screens to work the best
merge m:1 sa using "stata data/multiple meter summary information"
replace multiple = 0 if multiple == .
replace share_facility = 0 if share_fac == .
replace many_meter_issue = 0 if many_meter_issue  == .
drop if _merge == 2
drop _merge


//Variable definitions
/////////dup group
//defined as egen group_dup = group(lat_prm long_prm per_id)
//it's basically everything at the same premise with the same person_id paying for the bill. It's how I define groups for collasping

/////////Many_meter_issue
//if someone has a many_meter_issue it means they have meters that were upgraded on different days so are in wildly different bins
//for example a group_dup could be in -3 and 40. Now the 40 won't be in the data, so This tag tells me it exists and I might want to ignore it
//as part of a robustness check

/////////Multiple_collapse_ok
//this means that I know I can collapse across these saids. These meters within the dup_group that all were installed on the same date and
///they were the ONLY meters there. 

///////share_facility_big_meter
//means you share a facility with a large meter. Like an A1 with an E19 or A10. 
//This can exist with a multiple_collapse_ok and many_meter_issue. Means that a meter shares something with a big place. 



save "temp/temp3", replace

use "temp/temp3", clear


//drop the people where 1 meter is on PDP and one isn't. Could come back and include if I want later
egen status_multiple = mean(pdp_always_on), by(group_dup)
replace status_multiple = 0 if multiple == 0
drop if inrange(status_multiple,.1,.9)
drop status_multiple

//this gives a group_dup to all the ones that didn't merge in from the other document
gen double group_dup_old =group_dup //I do it this way instead of rename due to stupid rounding issues
drop group_dup
format group_dup_old sa %15.2g
order group_dup*

replace  group_dup_old = sa if group_dup_old == .

egen double group_dup = group(group_dup_old)
drop group_dup_old

format sa sp %12.0g

order group_dup multiple share_facility_big_meter	many_meter_issue


sort group_dup


//Now I need a collapsed kwh value so I can dulplicates drop later
//Do the collapse for when there are multiples I can collapse, otherwise get your original kwh value
egen kwh_collapse = sum(kwh) if multiple == 1, by(group_dup date hour)
replace kwh_collapse = kwh if kwh_collapse ==. & multiple == 0 //redundant last two conditions, only need 1, but to be sure


//the temp_f should be about the same. A few times, they varried, so this just replaces temp_f with an avg across the group_dup
egen temp_f_mean = mean(temp_f) if multiple == 1, by(group_dup hour date) 
replace temp_f_mean  = temp_f if temp_f_mean == .
drop temp_f 



save "temp/temp4", replace


use "temp/temp4", clear

drop if apct > 110
//this deals with the duplicates going on
gduplicates drop

save "temp/only_1_sa", replace
use "temp/only_1_sa", clear

//This is intersting. I want an SA to merge things in on
//Basically to come up with a new sa_id for the multiple merge people
//I do it by taking the person with the max kWh and using their sa. 
egen double sum_kwh = sum(kwh), by(sa)
egen double max_sum_kwh = max(sum_kwh), by(group_dup)

gen eventual_sa = 0
replace eventual_sa = 1 if sum_kwh == max_sum_kwh 

gen double new_sa_temp = .
replace new_sa_temp = sa if eventual_sa == 1
replace new_sa_temp = 0 if new_sa_temp == . //this helps for the max

order sum_kwh max_sum new_sa_temp

//BIG NOTE - want to only do for multiple collapsing accouts, since others retain original SA
egen double sa_new = max(new_sa_temp) if multiple == 1, by(group_dup)
replace sa_new = sa if multiple == 0
order sa_new sa
format sa* new_sa_temp %15.0g

drop new_sa_temp eventual_sa sum_kwh max_sum_kwh
compress *

save "temp/temp333", replace

use "temp/temp333", clear

preserve
keep if date == mdy(8,18,2015)
keep if hour == 17
keep sa_new sa multiple share many group_dup
duplicates drop
save "stata data/said collapsing guide for multiple meters", replace //used in step 5

restore



drop kwh  sa spid apct ln_kwh
gduplicates drop


save "stata data/end_of_step_4_data", replace



